Setup

You are to register for Kaggle.com (free) and compete in the House Prices: Advanced Regression Techniques competition. https://www.kaggle.com/c/house-prices-advanced-regression-techniques . I want you to do the following.

Problem 1 - Descriptive and Inferential Statistics.

Provide univariate descriptive statistics and appropriate plots for the training data set.

summary(training_data)
##        id          ms_sub_class    ms_zoning          lot_frontage   
##  Min.   :   1.0   Min.   : 20.0   Length:1460        Min.   : 21.00  
##  1st Qu.: 365.8   1st Qu.: 20.0   Class :character   1st Qu.: 59.00  
##  Median : 730.5   Median : 50.0   Mode  :character   Median : 69.00  
##  Mean   : 730.5   Mean   : 56.9                      Mean   : 70.05  
##  3rd Qu.:1095.2   3rd Qu.: 70.0                      3rd Qu.: 80.00  
##  Max.   :1460.0   Max.   :190.0                      Max.   :313.00  
##                                                      NA's   :259     
##     lot_area         street             alley            lot_shape        
##  Min.   :  1300   Length:1460        Length:1460        Length:1460       
##  1st Qu.:  7554   Class :character   Class :character   Class :character  
##  Median :  9478   Mode  :character   Mode  :character   Mode  :character  
##  Mean   : 10517                                                           
##  3rd Qu.: 11602                                                           
##  Max.   :215245                                                           
##                                                                           
##  land_contour        utilities          lot_config         land_slope       
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  neighborhood        condition1         condition2         bldg_type        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  house_style         overall_qual     overall_cond     year_built  
##  Length:1460        Min.   : 1.000   Min.   :1.000   Min.   :1872  
##  Class :character   1st Qu.: 5.000   1st Qu.:5.000   1st Qu.:1954  
##  Mode  :character   Median : 6.000   Median :5.000   Median :1973  
##                     Mean   : 6.099   Mean   :5.575   Mean   :1971  
##                     3rd Qu.: 7.000   3rd Qu.:6.000   3rd Qu.:2000  
##                     Max.   :10.000   Max.   :9.000   Max.   :2010  
##                                                                    
##  year_remod_add  roof_style         roof_matl         exterior1st       
##  Min.   :1950   Length:1460        Length:1460        Length:1460       
##  1st Qu.:1967   Class :character   Class :character   Class :character  
##  Median :1994   Mode  :character   Mode  :character   Mode  :character  
##  Mean   :1985                                                           
##  3rd Qu.:2004                                                           
##  Max.   :2010                                                           
##                                                                         
##  exterior2nd        mas_vnr_type        mas_vnr_area     exter_qual       
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median :   0.0   Mode  :character  
##                                        Mean   : 103.7                     
##                                        3rd Qu.: 166.0                     
##                                        Max.   :1600.0                     
##                                        NA's   :8                          
##   exter_cond         foundation         bsmt_qual          bsmt_cond        
##  Length:1460        Length:1460        Length:1460        Length:1460       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##  bsmt_exposure      bsmt_fin_type1      bsmt_fin_sf1    bsmt_fin_type2    
##  Length:1460        Length:1460        Min.   :   0.0   Length:1460       
##  Class :character   Class :character   1st Qu.:   0.0   Class :character  
##  Mode  :character   Mode  :character   Median : 383.5   Mode  :character  
##                                        Mean   : 443.6                     
##                                        3rd Qu.: 712.2                     
##                                        Max.   :5644.0                     
##                                                                           
##   bsmt_fin_sf2      bsmt_unf_sf     total_bsmt_sf      heating         
##  Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Length:1460       
##  1st Qu.:   0.00   1st Qu.: 223.0   1st Qu.: 795.8   Class :character  
##  Median :   0.00   Median : 477.5   Median : 991.5   Mode  :character  
##  Mean   :  46.55   Mean   : 567.2   Mean   :1057.4                     
##  3rd Qu.:   0.00   3rd Qu.: 808.0   3rd Qu.:1298.2                     
##  Max.   :1474.00   Max.   :2336.0   Max.   :6110.0                     
##                                                                        
##   heating_qc        central_air         electrical         x1st_flr_sf  
##  Length:1460        Length:1460        Length:1460        Min.   : 334  
##  Class :character   Class :character   Class :character   1st Qu.: 882  
##  Mode  :character   Mode  :character   Mode  :character   Median :1087  
##                                                           Mean   :1163  
##                                                           3rd Qu.:1391  
##                                                           Max.   :4692  
##                                                                         
##   x2nd_flr_sf   low_qual_fin_sf    gr_liv_area   bsmt_full_bath  
##  Min.   :   0   Min.   :  0.000   Min.   : 334   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.:  0.000   1st Qu.:1130   1st Qu.:0.0000  
##  Median :   0   Median :  0.000   Median :1464   Median :0.0000  
##  Mean   : 347   Mean   :  5.845   Mean   :1515   Mean   :0.4253  
##  3rd Qu.: 728   3rd Qu.:  0.000   3rd Qu.:1777   3rd Qu.:1.0000  
##  Max.   :2065   Max.   :572.000   Max.   :5642   Max.   :3.0000  
##                                                                  
##  bsmt_half_bath      full_bath       half_bath      bedroom_abv_gr 
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :0.00000   Median :2.000   Median :0.0000   Median :3.000  
##  Mean   :0.05753   Mean   :1.565   Mean   :0.3829   Mean   :2.866  
##  3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :8.000  
##                                                                    
##  kitchen_abv_gr  kitchen_qual       tot_rms_abv_grd   functional       
##  Min.   :0.000   Length:1460        Min.   : 2.000   Length:1460       
##  1st Qu.:1.000   Class :character   1st Qu.: 5.000   Class :character  
##  Median :1.000   Mode  :character   Median : 6.000   Mode  :character  
##  Mean   :1.047                      Mean   : 6.518                     
##  3rd Qu.:1.000                      3rd Qu.: 7.000                     
##  Max.   :3.000                      Max.   :14.000                     
##                                                                        
##    fireplaces    fireplace_qu       garage_type        garage_yr_blt 
##  Min.   :0.000   Length:1460        Length:1460        Min.   :1900  
##  1st Qu.:0.000   Class :character   Class :character   1st Qu.:1961  
##  Median :1.000   Mode  :character   Mode  :character   Median :1980  
##  Mean   :0.613                                         Mean   :1979  
##  3rd Qu.:1.000                                         3rd Qu.:2002  
##  Max.   :3.000                                         Max.   :2010  
##                                                        NA's   :81    
##  garage_finish       garage_cars     garage_area     garage_qual       
##  Length:1460        Min.   :0.000   Min.   :   0.0   Length:1460       
##  Class :character   1st Qu.:1.000   1st Qu.: 334.5   Class :character  
##  Mode  :character   Median :2.000   Median : 480.0   Mode  :character  
##                     Mean   :1.767   Mean   : 473.0                     
##                     3rd Qu.:2.000   3rd Qu.: 576.0                     
##                     Max.   :4.000   Max.   :1418.0                     
##                                                                        
##  garage_cond        paved_drive         wood_deck_sf    open_porch_sf   
##  Length:1460        Length:1460        Min.   :  0.00   Min.   :  0.00  
##  Class :character   Class :character   1st Qu.:  0.00   1st Qu.:  0.00  
##  Mode  :character   Mode  :character   Median :  0.00   Median : 25.00  
##                                        Mean   : 94.24   Mean   : 46.66  
##                                        3rd Qu.:168.00   3rd Qu.: 68.00  
##                                        Max.   :857.00   Max.   :547.00  
##                                                                         
##  enclosed_porch    x3ssn_porch      screen_porch      pool_area      
##  Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.000  
##  1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.00   1st Qu.:  0.000  
##  Median :  0.00   Median :  0.00   Median :  0.00   Median :  0.000  
##  Mean   : 21.95   Mean   :  3.41   Mean   : 15.06   Mean   :  2.759  
##  3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.00   3rd Qu.:  0.000  
##  Max.   :552.00   Max.   :508.00   Max.   :480.00   Max.   :738.000  
##                                                                      
##    pool_qc             fence           misc_feature          misc_val       
##  Length:1460        Length:1460        Length:1460        Min.   :    0.00  
##  Class :character   Class :character   Class :character   1st Qu.:    0.00  
##  Mode  :character   Mode  :character   Mode  :character   Median :    0.00  
##                                                           Mean   :   43.49  
##                                                           3rd Qu.:    0.00  
##                                                           Max.   :15500.00  
##                                                                             
##     mo_sold          yr_sold      sale_type         sale_condition    
##  Min.   : 1.000   Min.   :2006   Length:1460        Length:1460       
##  1st Qu.: 5.000   1st Qu.:2007   Class :character   Class :character  
##  Median : 6.000   Median :2008   Mode  :character   Mode  :character  
##  Mean   : 6.322   Mean   :2008                                        
##  3rd Qu.: 8.000   3rd Qu.:2009                                        
##  Max.   :12.000   Max.   :2010                                        
##                                                                       
##    sale_price    
##  Min.   : 34900  
##  1st Qu.:129975  
##  Median :163000  
##  Mean   :180921  
##  3rd Qu.:214000  
##  Max.   :755000  
## 
hist(training_data$overall_qual)

plot(training_data$overall_qual, training_data$sale_price)

hist(training_data$year_built)

plot(training_data$year_built, training_data$sale_price)

hist(training_data$year_remod_add)

plot(training_data$year_remod_add, training_data$sale_price)

hist(training_data$bsmt_fin_sf1)

plot(training_data$bsmt_fin_sf1, training_data$sale_price)

hist(training_data$total_bsmt_sf)

plot(training_data$total_bsmt_sf, training_data$sale_price)

hist(training_data$x1st_flr_sf)

plot(training_data$x1st_flr_sf, training_data$sale_price)

hist(training_data$x2nd_flr_sf)

plot(training_data$x2nd_flr_sf, training_data$sale_price)

hist(training_data$gr_liv_area)

plot(training_data$gr_liv_area, training_data$sale_price)

hist(training_data$full_bath)

plot(training_data$full_bath, training_data$sale_price)

hist(training_data$tot_rms_abv_grd)

plot(training_data$tot_rms_abv_grd, training_data$sale_price)

hist(training_data$fireplaces)

plot(training_data$fireplaces, training_data$sale_price)

hist(training_data$garage_cars)

plot(training_data$garage_cars, training_data$sale_price)

hist(training_data$garage_area)

plot(training_data$garage_area, training_data$sale_price)

hist(training_data$wood_deck_sf)

plot(training_data$wood_deck_sf, training_data$sale_price)

hist(training_data$open_porch_sf)

plot(training_data$open_porch_sf, training_data$sale_price)

training_data %>% select_if(., is.character) %>% str()
## tibble [1,460 × 43] (S3: tbl_df/tbl/data.frame)
##  $ ms_zoning     : chr [1:1460] "RL" "RL" "RL" "RL" ...
##  $ street        : chr [1:1460] "Pave" "Pave" "Pave" "Pave" ...
##  $ alley         : chr [1:1460] NA NA NA NA ...
##  $ lot_shape     : chr [1:1460] "Reg" "Reg" "IR1" "IR1" ...
##  $ land_contour  : chr [1:1460] "Lvl" "Lvl" "Lvl" "Lvl" ...
##  $ utilities     : chr [1:1460] "AllPub" "AllPub" "AllPub" "AllPub" ...
##  $ lot_config    : chr [1:1460] "Inside" "FR2" "Inside" "Corner" ...
##  $ land_slope    : chr [1:1460] "Gtl" "Gtl" "Gtl" "Gtl" ...
##  $ neighborhood  : chr [1:1460] "CollgCr" "Veenker" "CollgCr" "Crawfor" ...
##  $ condition1    : chr [1:1460] "Norm" "Feedr" "Norm" "Norm" ...
##  $ condition2    : chr [1:1460] "Norm" "Norm" "Norm" "Norm" ...
##  $ bldg_type     : chr [1:1460] "1Fam" "1Fam" "1Fam" "1Fam" ...
##  $ house_style   : chr [1:1460] "2Story" "1Story" "2Story" "2Story" ...
##  $ roof_style    : chr [1:1460] "Gable" "Gable" "Gable" "Gable" ...
##  $ roof_matl     : chr [1:1460] "CompShg" "CompShg" "CompShg" "CompShg" ...
##  $ exterior1st   : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Sdng" ...
##  $ exterior2nd   : chr [1:1460] "VinylSd" "MetalSd" "VinylSd" "Wd Shng" ...
##  $ mas_vnr_type  : chr [1:1460] "BrkFace" "None" "BrkFace" "None" ...
##  $ exter_qual    : chr [1:1460] "Gd" "TA" "Gd" "TA" ...
##  $ exter_cond    : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ foundation    : chr [1:1460] "PConc" "CBlock" "PConc" "BrkTil" ...
##  $ bsmt_qual     : chr [1:1460] "Gd" "Gd" "Gd" "TA" ...
##  $ bsmt_cond     : chr [1:1460] "TA" "TA" "TA" "Gd" ...
##  $ bsmt_exposure : chr [1:1460] "No" "Gd" "Mn" "No" ...
##  $ bsmt_fin_type1: chr [1:1460] "GLQ" "ALQ" "GLQ" "ALQ" ...
##  $ bsmt_fin_type2: chr [1:1460] "Unf" "Unf" "Unf" "Unf" ...
##  $ heating       : chr [1:1460] "GasA" "GasA" "GasA" "GasA" ...
##  $ heating_qc    : chr [1:1460] "Ex" "Ex" "Ex" "Gd" ...
##  $ central_air   : chr [1:1460] "Y" "Y" "Y" "Y" ...
##  $ electrical    : chr [1:1460] "SBrkr" "SBrkr" "SBrkr" "SBrkr" ...
##  $ kitchen_qual  : chr [1:1460] "Gd" "TA" "Gd" "Gd" ...
##  $ functional    : chr [1:1460] "Typ" "Typ" "Typ" "Typ" ...
##  $ fireplace_qu  : chr [1:1460] NA "TA" "TA" "Gd" ...
##  $ garage_type   : chr [1:1460] "Attchd" "Attchd" "Attchd" "Detchd" ...
##  $ garage_finish : chr [1:1460] "RFn" "RFn" "RFn" "Unf" ...
##  $ garage_qual   : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ garage_cond   : chr [1:1460] "TA" "TA" "TA" "TA" ...
##  $ paved_drive   : chr [1:1460] "Y" "Y" "Y" "Y" ...
##  $ pool_qc       : chr [1:1460] NA NA NA NA ...
##  $ fence         : chr [1:1460] NA NA NA NA ...
##  $ misc_feature  : chr [1:1460] NA NA NA NA ...
##  $ sale_type     : chr [1:1460] "WD" "WD" "WD" "WD" ...
##  $ sale_condition: chr [1:1460] "Normal" "Normal" "Normal" "Abnorml" ...
ggplot(training_data) +
  geom_boxplot(aes(x=ms_zoning, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=street, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=lot_shape, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=land_contour, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=utilities, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=lot_config, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=land_slope, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=neighborhood, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=condition1, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=condition2, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=bldg_type, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=house_style, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=roof_style, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=exterior1st, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=exterior2nd, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=exter_qual, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=exter_cond, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=heating, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=heating_qc, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=central_air, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=electrical, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=kitchen_qual, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=functional, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=garage_type, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=garage_finish, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=sale_type, y=sale_price))

ggplot(training_data) +
  geom_boxplot(aes(x=sale_condition, y=sale_price))

Provide a scatterplot matrix for at least two of the independent variables and the dependent variable.

d1 <- training_data %>% 
  select(sale_price, garage_area, x1st_flr_sf)

plot(d1, pch=20 , cex=1.5 , col="purple")

Derive a correlation matrix for any three quantitative variables in the dataset.

num_items = ncol(d1)

cor_data <- c()

for(i in seq(num_items)) {
  for(j in seq(num_items)) {
    cor_data <- c(cor_data,cor(d1[[i]],d1[[j]]))
    
  }
}

corr_matrix <- matrix(cor_data, ncol=num_items, nrow=num_items)

Test the hypotheses that the correlations between each pairwise set of variables is 0 and provide an 80% confidence interval.

cor.test(d1$sale_price, d1$garage_area, conf.level=0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  d1$sale_price and d1$garage_area
## t = 30.446, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.6024756 0.6435283
## sample estimates:
##       cor 
## 0.6234314
cor.test(d1$sale_price, d1$x1st_flr_sf, conf.level=0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  d1$sale_price and d1$x1st_flr_sf
## t = 29.078, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.5841687 0.6266715
## sample estimates:
##       cor 
## 0.6058522
cor.test(d1$garage_area, d1$x1st_flr_sf, conf.level=0.80)
## 
##  Pearson's product-moment correlation
## 
## data:  d1$garage_area and d1$x1st_flr_sf
## t = 21.451, df = 1458, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 80 percent confidence interval:
##  0.4638446 0.5148798
## sample estimates:
##       cor 
## 0.4897817

Discuss the meaning of your analysis.

Would you be worried about familywise error? Why or why not?

Problem 2 - Linear Algebra and Correlation.

prec_matrix <- solve(corr_matrix)

cp <- corr_matrix %*% prec_matrix
pc <- prec_matrix %*% corr_matrix

lu_decomp <- lu.decomposition(corr_matrix)

l <- lu_decomp$L
u <- lu_decomp$U

lu <- l %*% u

corr_matrix == lu
##      [,1] [,2] [,3]
## [1,] TRUE TRUE TRUE
## [2,] TRUE TRUE TRUE
## [3,] TRUE TRUE TRUE

Problem 3 - Calculus-Based Probability & Statistics.

Many times, it makes sense to fit a closed form distribution to data.
- Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.
- Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html).
- Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))).
- Plot a histogram and compare it with a histogram of your original variable.
- Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF). - Also generate a 95% confidence interval from the empirical data, assuming normality.
- Finally, provide the empirical 5th percentile and 95th percentile of the data.
- Discuss.

Select a variable in the Kaggle.com training dataset that is skewed to the right, shift it so that the minimum value is absolutely above zero if necessary.

This variable has a minimum value of 1300

ggplot(training_data) +
  geom_histogram(aes(x=lot_area), fill='white', color='black', bins=50)

lot_area <- training_data %>% select(lot_area)

summary(lot_area)
##     lot_area     
##  Min.   :  1300  
##  1st Qu.:  7554  
##  Median :  9478  
##  Mean   : 10517  
##  3rd Qu.: 11602  
##  Max.   :215245

Then load the MASS package and run fitdistr to fit an exponential probability density function. (See https://stat.ethz.ch/R-manual/R-devel/library/MASS/html/fitdistr.html ).

lot_area_dist <- MASS::fitdistr(lot_area$lot_area, densfun = 'exponential')

Find the optimal value of \(\lambda\) for this distribution, and then take 1000 samples from this exponential distribution using this value (e.g., rexp(1000, \(\lambda\))).

lambda_opt <- lot_area_dist$estimate

set.seed(51323)

sample_dist <- rexp(1000,lambda_opt)

Plot a histogram and compare it with a histogram of your original variable.

hist(training_data$lot_area)

hist(sample_dist)

Using the exponential pdf, find the 5th and 95th percentiles using the cumulative distribution function (CDF).

qexp(.05, lambda_opt)
## [1] 539.4428
qexp(.95, lambda_opt)
## [1] 31505.6

Also generate a 95% confidence interval from the empirical data, assuming normality.

x <- mean(lot_area$lot_area)
std_dev <- sd(lot_area$lot_area)
n <- length(lot_area$lot_area)
z_score <- qnorm(.975)

(lower_limit <- x - (z_score*(std_dev/sqrt(n))))
## [1] 10004.84
(upper_limit <- x+(z_score*(std_dev/sqrt(n))))
## [1] 11028.81

Finally, provide the empirical 5th percentile and 95th percentile of the data.

quantile(lot_area$lot_area, probs=c(0.05, 0.95))
##       5%      95% 
##  3311.70 17401.15

Discuss.

Problem 4 - Modeling

Build some type of multiple regression model and submit your model to the competition board.

sp_quantiles <- quantile(training_data$sale_price)
sp_quantiles
##     0%    25%    50%    75%   100% 
##  34900 129975 163000 214000 755000
IQR <- sp_quantiles[[4]]-sp_quantiles[[2]]
upper_limit <- sp_quantiles[[4]] + 1.5*IQR
lower_limit <- sp_quantiles[[2]] - 1.5*IQR


training_adj <- training_data %>% filter(between(sale_price, lower_limit, upper_limit))
training_corr <- cor(training_adj %>% select_if(is.numeric))

training_corr <- round(training_corr, digits=2)

sales_corr <- training_corr[,'sale_price']

sales_corr
##              id    ms_sub_class    lot_frontage        lot_area    overall_qual 
##           -0.01           -0.06              NA            0.25            0.78 
##    overall_cond      year_built  year_remod_add    mas_vnr_area    bsmt_fin_sf1 
##           -0.05            0.56            0.54              NA            0.29 
##    bsmt_fin_sf2     bsmt_unf_sf   total_bsmt_sf     x1st_flr_sf     x2nd_flr_sf 
##            0.01            0.22            0.54            0.52            0.32 
## low_qual_fin_sf     gr_liv_area  bsmt_full_bath  bsmt_half_bath       full_bath 
##           -0.06            0.66            0.20           -0.02            0.58 
##       half_bath  bedroom_abv_gr  kitchen_abv_gr tot_rms_abv_grd      fireplaces 
##            0.28            0.20           -0.15            0.47            0.45 
##   garage_yr_blt     garage_cars     garage_area    wood_deck_sf   open_porch_sf 
##              NA            0.63            0.61            0.30            0.33 
##  enclosed_porch     x3ssn_porch    screen_porch       pool_area        misc_val 
##           -0.14            0.04            0.11            0.05           -0.01 
##         mo_sold         yr_sold      sale_price 
##            0.07           -0.03            1.00

Build Model

training_adj <- training_adj %>% mutate(overall_qual = as.factor(overall_qual))
training_adj <- training_adj %>% mutate(garage_cars = as.factor(garage_cars))
training_adj <- training_adj %>% mutate(year_built_cat = case_when(
  year_built < 1980~'pre_1980',
  TRUE ~ 'post_1980'
)) 
training_adj <- training_adj %>% mutate(full_bath = as.factor(full_bath))
training_adj <- training_adj %>% mutate(tot_rms_abv_grd = as.factor(tot_rms_abv_grd))
training_adj <- training_adj %>% mutate(has_2nd_flr = ifelse(x2nd_flr_sf == 0,'no','yes')) 


m3 <- lm(sale_price ~ overall_qual + log(gr_liv_area) + garage_cars + 
           year_built_cat + full_bath + total_bsmt_sf + x1st_flr_sf +
           (has_2nd_flr)*(x2nd_flr_sf) + tot_rms_abv_grd + 
           kitchen_qual + electrical + central_air + exter_qual +
           exter_cond + heating + sale_type, 
         data=training_adj)

Provide your complete model summary and results with analysis.

summary(m3)
## 
## Call:
## lm(formula = sale_price ~ overall_qual + log(gr_liv_area) + garage_cars + 
##     year_built_cat + full_bath + total_bsmt_sf + x1st_flr_sf + 
##     (has_2nd_flr) * (x2nd_flr_sf) + tot_rms_abv_grd + kitchen_qual + 
##     electrical + central_air + exter_qual + exter_cond + heating + 
##     sale_type, data = training_adj)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -148847  -13056     290   12539  123454 
## 
## Coefficients: (1 not defined because of singularities)
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                -2.434e+05  7.864e+04  -3.095 0.002007 ** 
## overall_qual2               1.004e+04  3.089e+04   0.325 0.745186    
## overall_qual3              -1.981e+04  2.908e+04  -0.681 0.495972    
## overall_qual4              -7.521e+03  2.857e+04  -0.263 0.792395    
## overall_qual5              -8.122e+02  2.872e+04  -0.028 0.977446    
## overall_qual6               7.544e+03  2.877e+04   0.262 0.793180    
## overall_qual7               2.097e+04  2.881e+04   0.728 0.466728    
## overall_qual8               4.521e+04  2.895e+04   1.562 0.118635    
## overall_qual9               7.421e+04  2.969e+04   2.499 0.012558 *  
## overall_qual10             -3.258e+04  3.239e+04  -1.006 0.314707    
## log(gr_liv_area)            7.083e+04  1.194e+04   5.933 3.79e-09 ***
## garage_cars1                9.305e+03  3.376e+03   2.756 0.005923 ** 
## garage_cars2                1.756e+04  3.425e+03   5.128 3.35e-07 ***
## garage_cars3                3.282e+04  4.390e+03   7.477 1.37e-13 ***
## garage_cars4                3.913e+04  1.187e+04   3.295 0.001009 ** 
## year_built_catpre_1980     -6.168e+03  2.413e+03  -2.556 0.010693 *  
## full_bath1                 -4.545e+03  9.852e+03  -0.461 0.644662    
## full_bath2                 -5.593e+03  9.897e+03  -0.565 0.572066    
## full_bath3                  2.548e+03  1.165e+04   0.219 0.826923    
## total_bsmt_sf               1.528e+01  3.045e+00   5.018 5.92e-07 ***
## x1st_flr_sf                 3.273e+00  8.520e+00   0.384 0.700911    
## has_2nd_flryes             -3.513e+04  4.285e+03  -8.199 5.63e-16 ***
## x2nd_flr_sf                 3.997e+01  8.177e+00   4.888 1.14e-06 ***
## tot_rms_abv_grd3           -6.355e+04  3.801e+04  -1.672 0.094784 .  
## tot_rms_abv_grd4           -6.679e+04  3.766e+04  -1.774 0.076336 .  
## tot_rms_abv_grd5           -6.785e+04  3.782e+04  -1.794 0.073065 .  
## tot_rms_abv_grd6           -7.054e+04  3.798e+04  -1.858 0.063442 .  
## tot_rms_abv_grd7           -7.113e+04  3.806e+04  -1.869 0.061884 .  
## tot_rms_abv_grd8           -7.442e+04  3.812e+04  -1.952 0.051134 .  
## tot_rms_abv_grd9           -6.817e+04  3.822e+04  -1.784 0.074692 .  
## tot_rms_abv_grd10          -7.737e+04  3.845e+04  -2.012 0.044384 *  
## tot_rms_abv_grd11          -1.031e+05  3.898e+04  -2.646 0.008250 ** 
## tot_rms_abv_grd12          -1.264e+05  3.931e+04  -3.216 0.001333 ** 
## tot_rms_abv_grd14          -8.526e+04  4.724e+04  -1.805 0.071326 .  
## kitchen_qualFa             -2.712e+04  6.390e+03  -4.244 2.35e-05 ***
## kitchen_qualGd             -1.677e+04  3.985e+03  -4.208 2.75e-05 ***
## kitchen_qualTA             -2.612e+04  4.228e+03  -6.178 8.61e-10 ***
## electricalFuseF            -1.119e+03  5.804e+03  -0.193 0.847124    
## electricalFuseP            -6.052e+03  1.528e+04  -0.396 0.692074    
## electricalMix              -4.419e+04  2.562e+04  -1.725 0.084765 .  
## electricalSBrkr             6.404e+03  2.897e+03   2.211 0.027234 *  
## central_airY                1.721e+04  3.526e+03   4.883 1.17e-06 ***
## exter_qualFa               -3.331e+04  1.083e+04  -3.075 0.002150 ** 
## exter_qualGd               -1.188e+04  6.669e+03  -1.782 0.074982 .  
## exter_qualTA               -1.874e+04  6.930e+03  -2.704 0.006932 ** 
## exter_condFa               -4.960e+04  1.618e+04  -3.065 0.002221 ** 
## exter_condGd               -3.739e+04  1.541e+04  -2.427 0.015368 *  
## exter_condPo               -7.224e+04  3.035e+04  -2.381 0.017420 *  
## exter_condTA               -4.076e+04  1.528e+04  -2.668 0.007721 ** 
## heatingGasA                -2.290e+04  2.603e+04  -0.879 0.379307    
## heatingGasW                -3.121e+04  2.663e+04  -1.172 0.241532    
## heatingGrav                -1.441e+04  2.768e+04  -0.520 0.602827    
## heatingOthW                -8.464e+04  3.157e+04  -2.681 0.007427 ** 
## heatingWall                -1.114e+04  2.850e+04  -0.391 0.695874    
## sale_typeCon                6.544e+04  1.813e+04   3.610 0.000318 ***
## sale_typeConLD              9.236e+03  9.600e+03   0.962 0.336171    
## sale_typeConLI             -7.055e+03  1.314e+04  -0.537 0.591424    
## sale_typeConLw              1.263e+04  1.240e+04   1.018 0.308646    
## sale_typeCWD                2.814e+04  1.337e+04   2.105 0.035485 *  
## sale_typeNew                1.595e+04  4.912e+03   3.247 0.001197 ** 
## sale_typeOth                1.533e+04  1.514e+04   1.012 0.311608    
## sale_typeWD                 1.479e+04  3.931e+03   3.763 0.000175 ***
## has_2nd_flryes:x2nd_flr_sf         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 24850 on 1336 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.8317, Adjusted R-squared:  0.8241 
## F-statistic: 108.3 on 61 and 1336 DF,  p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(m3)
## Warning: not plotting observations with leverage one:
##   241, 359, 380, 508, 607, 1265

hist(resid(m3))

Predict on Test Data and submit Results to Kaggle

test_data <- read_delim('/Users/korymartin/Library/Mobile Documents/com~apple~CloudDocs/Grad Programs/CUNY SPS/DATA 605/Final Exam/train.csv')
## Rows: 1460 Columns: 81
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (43): MSZoning, Street, Alley, LotShape, LandContour, Utilities, LotConf...
## dbl (38): Id, MSSubClass, LotFrontage, LotArea, OverallQual, OverallCond, Ye...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test_data <- clean_names(test_data)

test_data <- test_data %>% mutate(overall_qual = as.factor(overall_qual))
test_data <- test_data %>% mutate(garage_cars = as.factor(garage_cars))
test_data <- test_data %>% mutate(year_built_cat = case_when(
  year_built < 1980~'pre_1980',
  TRUE ~ 'post_1980'
)) 
test_data <- test_data %>% mutate(full_bath = as.factor(full_bath))
test_data <- test_data %>% mutate(tot_rms_abv_grd = as.factor(tot_rms_abv_grd))
test_data <- test_data %>% mutate(has_2nd_flr = ifelse(x2nd_flr_sf == 0,'no','yes'))

predict.y <- predict(m3, test_data)
## Warning in predict.lm(m3, test_data): prediction from a rank-deficient fit may
## be misleading
actual.y <- test_data$sale_price

test_data2 <- tibble(cbind(test_data, data.frame(predict.y)))

seq_count <- seq(1459)

test_data2 <- test_data2 %>% select(id, predict.y) %>% rename(SalePrice = predict.y, Id = id)
test_data2 <- test_data2 %>% drop_na(SalePrice)
test_data2 <- cbind(test_data2, seq_count)
test_data2 <- tibble(test_data2)
test_data2 <- test_data2 %>% mutate(Id = (seq_count+1460)) 
test_data2 <- test_data2 %>% select(Id, SalePrice)

write_csv(test_data2, './test_data2.csv')

Report your Kaggle.com user name and score. Provide a screen snapshot of your score with your name identifiable.